library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(prophet)
## Loading required package: Rcpp
## Loading required package: rlang
library(stats)
library(forecast)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(glue)
library(imputeTS)
library(ggfortify)
## Registered S3 methods overwritten by 'ggfortify':
##   method                 from    
##   autoplot.Arima         forecast
##   autoplot.acf           forecast
##   autoplot.ar            forecast
##   autoplot.bats          forecast
##   autoplot.decomposed.ts forecast
##   autoplot.ets           forecast
##   autoplot.forecast      forecast
##   autoplot.stl           forecast
##   autoplot.ts            forecast
##   fitted.ar              forecast
##   fortify.ts             forecast
##   residuals.ar           forecast
library(jsonlite)
## 
## Attaching package: 'jsonlite'
## The following objects are masked from 'package:rlang':
## 
##     flatten, unbox
library(tidyr)
library(zoo)
## 
## Attaching package: 'zoo'
## The following object is masked from 'package:imputeTS':
## 
##     na.locf
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(corrplot)
## corrplot 0.92 loaded
library(readr)
library(tibbletime)
## 
## Attaching package: 'tibbletime'
## The following object is masked from 'package:stats':
## 
##     filter
library(magrittr)
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:tidyr':
## 
##     extract
## The following object is masked from 'package:rlang':
## 
##     set_names

Exercise 1

Task a)

# Load the raw CSV, keep the two relevant columns, parse the timestamps and
# drop the incomplete calendar years (2012 and 2018) in one pipeline.
traffic_data <- read.csv("traffic_volume.csv") %>%
  select(date_time, traffic_volume) %>%
  mutate(date_time = ymd_hms(date_time)) %>%
  filter(!year(date_time) %in% c(2012, 2018))

# Subsets for the first and last complete calendar year
year_2013_data <- traffic_data %>% filter(year(date_time) == 2013)
year_2017_data <- traffic_data %>% filter(year(date_time) == 2017)

# Print the first few traffic volumes of each year
cat("2013: ",head(year_2013_data$traffic_volume, 5),"\n")
## 2013:  1439 1502 933 576 372
cat("2017: ",head(year_2017_data$traffic_volume, 5))
## 2017:  1848 1806 1211 794 500
# Drop the 29th of February in leap years
traffic_data <- traffic_data %>%
  filter(!(month(date_time) == 2 & day(date_time) == 29))

# Sanity check: every remaining calendar year should come out to 365 days
# (hourly observations, so row count is divided by 24)
year_counts <- traffic_data %>%
  group_by(year(date_time)) %>%
  summarise(row_count = n() / 24)
year_counts
# Whole time range on the x-axis, one tick per year
ggplot(traffic_data, aes(x = date_time, y = traffic_volume)) +
  geom_line() +
  scale_x_datetime(date_breaks = "1 year", date_labels = "%b %Y") +
  labs(x = "Date", y = "Traffic Volume") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ggtitle(glue("Plotting the whole data"))

# All of 2013 on the x-axis, one tick per month
ggplot(year_2013_data, aes(x = date_time, y = traffic_volume)) +
  geom_line() +
  scale_x_datetime(date_breaks = "1 month", date_labels = "%b %Y") +
  labs(x = "Date", y = "Traffic Volume") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ggtitle(glue("Plotting all of 2013 data"))

# Zoom in on February 2013, one tick per day
feb_2013 <- year_2013_data %>%
  filter(month(date_time) == 2)

ggplot(feb_2013, aes(x = date_time, y = traffic_volume)) +
  geom_line() +
  scale_x_datetime(date_breaks = "1 day", date_labels = "%b %Y") +
  labs(x = "Date", y = "Traffic Volume") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ggtitle(glue("Plotting february in 2013"))

Task b)

# Shared summary used at every aggregation level: mean, standard deviation,
# minimum and maximum of the hourly traffic volume within each group.
summarise_volume <- function(grouped) {
  summarise(grouped,
            mean_volume = mean(traffic_volume),
            std_dev_volume = sd(traffic_volume),
            min_volume = min(traffic_volume),
            max_volume = max(traffic_volume))
}

# Aggregate data at daily level
daily_aggregated <- traffic_data %>%
  group_by(date = date(date_time)) %>%
  summarise_volume()

# Aggregate data at weekly level
weekly_aggregated <- traffic_data %>%
  group_by(week = lubridate::floor_date(date_time, "week")) %>%
  summarise_volume()

# Aggregate data at monthly level
monthly_aggregated <- traffic_data %>%
  group_by(month = lubridate::floor_date(date_time, "month")) %>%
  summarise_volume()

# Aggregate data at yearly level
yearly_aggregated <- traffic_data %>%
  group_by(year = year(date_time)) %>%
  summarise_volume()

# Line plot of an aggregated mean together with a +/- one-standard-deviation
# ribbon and the min/max envelope.  x_var and y_var are column names given as
# strings; the .data pronoun resolves them inside aes().
plot_with_stats <- function(data, x_var, y_var, title) {
  ggplot(data, aes(x = .data[[x_var]], y = .data[[y_var]])) +
    geom_line(aes(color = "Mean")) +
    geom_ribbon(aes(ymin = .data[[y_var]] - std_dev_volume,
                    ymax = .data[[y_var]] + std_dev_volume,
                    fill = "Std. Dev."),
                alpha = 0.2) +
    geom_line(aes(y = min_volume, color = "Min")) +
    geom_line(aes(y = max_volume, color = "Max")) +
    labs(x = x_var, y = y_var, title = title) +
    theme_minimal() +
    scale_color_manual(values = c("Mean" = "black", "Min" = "blue", "Max" = "red")) +
    scale_fill_manual(values = c("Std. Dev." = "gray"))
}

# Create and display the four aggregation-level plots
plot_specs <- list(
  list(df = daily_aggregated,   x = "date",  title = "Daily Traffic Volume"),
  list(df = weekly_aggregated,  x = "week",  title = "Weekly Traffic Volume"),
  list(df = monthly_aggregated, x = "month", title = "Monthly Traffic Volume"),
  list(df = yearly_aggregated,  x = "year",  title = "Yearly Traffic Volume")
)

for (spec in plot_specs) {
  print(plot_with_stats(spec$df, spec$x, "mean_volume", spec$title))
}

Task c)

# Candidate seasonal frequencies for the aggregated series, and a separate
# set for the hourly (unaggregated) series
frequencies <- c(7, 24, 24*7, 24*12, 24*24)
freq_hour <- c(24*7, 24*12, 24*365)


# Run an STL decomposition of one series and plot it.
#
# data:         data frame holding either the raw `traffic_volume` column or
#               an aggregated `mean_volume` column.
# interval:     aggregation-level label used in the plot title.
# frequency:    seasonal period handed to ts().
# sub_interval: unused; kept so existing call sites stay valid.
#
# Returns the fitted "stl" object invisibly (the plot is the main effect).
perform_stl_and_plot <- function(data, interval, frequency, sub_interval = NULL) {
  # Pick the value column by inspecting the data frame itself.  The previous
  # identical(data, traffic_data) comparison against a global was fragile:
  # it silently chose the wrong column as soon as a caller passed any
  # modified copy of traffic_data.
  if ("traffic_volume" %in% names(data)) {
    values <- data$traffic_volume
  } else {
    values <- data$mean_volume
  }
  stl_result <- stl(ts(values, frequency = frequency), s.window = "periodic")

  # Plot the decomposition
  plot(stl_result, main = paste("STL Decomposition -", interval, "-", frequency))

  invisible(stl_result)
}

# Hourly (unaggregated) data: weekly, ~monthly and yearly periods
for (freq in freq_hour) {
  perform_stl_and_plot(traffic_data, "Hourly", frequency = freq)
}

# Daily aggregated data: every candidate frequency
for (freq in frequencies) {
  perform_stl_and_plot(daily_aggregated, "Daily", frequency = freq)
}

# Weekly aggregated data: only the two smallest frequencies
for (freq in frequencies[1:2]) {
  perform_stl_and_plot(weekly_aggregated, "Weekly", frequency = freq)
}

# Monthly aggregated data: only the two smallest frequencies
for (freq in frequencies[1:2]) {
  perform_stl_and_plot(monthly_aggregated, "Monthly", frequency = freq)
}

Task d)

What is the interpretation of the seasonal components in the data?

We can see that the seasonal component consists of three main parts: seasonality, trend, and remainder. When we combine these three components, they make up the entire dataset. The seasonality specifically depicts the fluctuations in the data over different time periods, revealing a consistent pattern in the data. The trend represents the overall movement or direction of the data, showing consistent upward and downward trends. The remainder is calculated by subtracting the values of the seasonal and trend components from the time series. In our case, we observe that the remainder contains a consistent level of noise, except for the middle portion where the noise is smaller than in the other areas.

Does it make sense to aggregate by mean? Would median be more reasonable?

When we aggregate by mean, our data can be strongly affected by outliers — values that are very high or very low. The median would be more reasonable when robustness to such extreme values (outliers) is a concern.

Does simple STL decomposition make sense in the case of multiple seasons?

Simple STL decomposition is designed to handle time series data with a single dominant season. It may not be the best choice for data with multiple seasonal patterns, as it primarily focuses on extracting a single season and trend. When you have multiple seasonal components or complex seasonality, it makes
sense to use a more advanced decomposition method.

Exercise 2

Task a)

Missing values

# Load the Romanian energy dataset and parse the timestamp column
rom <- read.csv('romanian_energy.csv')
rom$DateTime <- ymd_hms(rom$DateTime)


# Raw series including the gaps
ggplot(rom, aes(x = DateTime, y = Consumption.Missing)) + geom_line()

# How large are the gaps?
ggplot_na_gapsize(rom$Consumption.Missing)

# Where in the series do the gaps occur?
ggplot_na_distribution(rom$Consumption.Missing)

## Task a)

ACF

# Time series object for the complete consumption series
ts_data <- ts(rom$Consumption.Full, frequency = 1)

# Lag horizons to explore: short-, medium- and long-range autocorrelation
lag_max_values <- c(100, 1000, 36000)

# Draw the ACF plots side by side.  Remember the previous graphics settings
# so the 1x3 layout does not leak into later plots (the original left
# par(mfrow) modified).
old_par <- par(mfrow = c(1, length(lag_max_values)))

for (lag_max in lag_max_values) {
  acf(ts_data, lag.max = lag_max, main = paste("ACF with lag.max = ", lag_max))
}

# Restore the graphics parameters changed above
par(old_par)

comment on the seasonal components.

We can see that the seasonal periods are daily, weekly and yearly.

Task b)

# Mean squared error between a ground-truth vector and a prediction vector.
MSE <- function(y_true, y_pred) {
  squared_errors <- (y_true - y_pred)^2
  mean(squared_errors)
}

# Impute the gappy consumption series three ways:
#   global - replace every NA with the overall mean
#   locf   - last observation carried forward
#   seadec - seasonal decomposition, then interpolation of the remainder
df_imp = rom %>% 
  mutate(global = na_mean(Consumption.Missing), 
         locf = na_locf(Consumption.Missing),
         seadec = na_seadec(ts(Consumption.Missing,
                               frequency = 365), algorithm = "interpolation")
         )

imp <- c('global', 'locf', 'seadec')

# Score every method against the ground truth.  vapply over the method names
# replaces the old 1:length(imp) index loop and keeps the method names on the
# resulting vector.
mse <- vapply(imp, function(method) {
  MSE(df_imp$Consumption.Full, df_imp[[method]])
}, numeric(1))

# Plot one imputation method (column name `imp_var` of df_imp) against both
# the gappy input and the ground-truth series.
plot_imputation <- function(imp_var){
  plot_obj <- ggplot_na_imputations(x_with_na = df_imp$Consumption.Missing,
                                    x_with_imputations = df_imp[[imp_var]],
                                    x_with_truth = df_imp$Consumption.Full,
                                    size_points = 0.4,
                                    size_truth = 0.4,
                                    size_lines = 0.5,
                                    size_imputations = 1.02) +
    ggtitle(glue("Imputation of {imp_var}"))
  print(plot_obj)
}

# Draw the diagnostic plot for every imputation method
for (method in imp) {
  plot_imputation(method)
}

cat("MSE","\n")
## MSE
mse
##   global     locf   seadec 
## 156227.4 482024.2 302835.6

Do you see any problems with the imputations?

Yes, there are some issues. The global method does not capture the underlying trend or seasonality but merely provides the mean value, which may not accurately represent the data. Locf takes the last available value and carries it forward through the gap, which can lead to poor performance in cases with large gaps. Seadec is an improvement over locf, but it still deviates from the ground truth. Overall, the “global” method performs best when considering the mean squared error values.

Task c)

# Seasonal periods found in (a): daily, weekly and yearly (hourly data)
seasonal_periods <- c(24, 24*7,24*365)

# Build a multiple-seasonal time series from the complete series and
# decompose it with mstl()
ts_full <- msts(rom$Consumption.Full, seasonal.periods = seasonal_periods)
decomposition_full <- mstl(ts_full)

autoplot(decomposition_full, main = "Decomposition of Consumption.Full")

# Convert 'Consumption.Missing' into a multiple seasonal time series object using 'msts'
ts_missing <- msts(rom$Consumption.Missing, seasonal.periods = seasonal_periods)

# Decompose the time series using 'mstl'
decomposition_missing <- mstl(ts_missing)

miss = decomposition_missing

# Blank out every row containing at least one missing value so the gaps show
# up as breaks in the plot.  (is.na() is TRUE for both NA and NaN, so the
# previous separate NA -> NaN conversion pass was redundant: every NA element
# sits in a flagged row and gets overwritten here anyway.)
rows_with_na <- apply(is.na(miss), 1, any)
miss[rows_with_na, ] <- NaN

# Plot both decompositions
autoplot(miss, main="Decomposition of Consumption.Missing" )

# Take into account all reasonable seasonal periods detected in (a)

Task d)

# Pair up the full and missing decomposition values for one component column.
#
# name:   NOTE(review) this argument was never used by the original code; it
#         is kept only so existing call sites remain valid.
# number: column index into the mstl() result (2 = trend, 3-5 = the three
#         seasonal terms, 6 = remainder).
#
# Returns a data frame with numeric columns "full" and "miss".
merging_miss_full <- function(name, number){
  # data.frame() names the columns directly; the old cbind() step only
  # coerced everything through an intermediate matrix.  as.numeric() keeps
  # the columns plain numeric, exactly as the matrix coercion did.
  data.frame(full = as.numeric(decomposition_full[, number]),
             miss = as.numeric(decomposition_missing[, number]))
}

# running the function for the different columns 
trend = merging_miss_full("trend", 2)
seas_24 = merging_miss_full("seasonal24", 3)
seas_168 = merging_miss_full("seasonal168", 4)
seas_8760 = merging_miss_full("seasonal8760", 5)
remainder = merging_miss_full("remainder", 6)

# Initialize a vector to store MSE values for each component
mse_values <- c()

# Impute the gaps in the "miss" column of a component data frame.
# Returns the data frame without the ground-truth column, plus an `imp`
# column holding the seasonally-split interpolation of the missing series.
imputing <- function(df){
  df_1 <- subset(df, select = -1)
  # Impute only the series that actually has gaps.  The original passed the
  # whole two-column data frame to ts(), which made na_seasplit() operate on
  # a multivariate series that included the ground-truth column itself.
  df_1 <- df_1 %>%
    mutate(imp = na_seasplit(ts(miss, frequency = 365),
                             algorithm = "interpolation",
                             find_frequency = FALSE))
  
  return(df_1)
}

# Run the imputation for every decomposition component...
component_tables <- list(trend = trend,
                         seas_24 = seas_24,
                         seas_168 = seas_168,
                         seas_8760 = seas_8760,
                         remainder = remainder)

imputed_tables <- lapply(component_tables, imputing)

# ...and score each imputed component against its ground truth
mse_values <- vapply(names(component_tables), function(nm) {
  MSE(component_tables[[nm]]$full, imputed_tables[[nm]]$imp)
}, numeric(1))

cat("MSE","\n")
## MSE
mse_values
##     trend   seas_24  seas_168 seas_8760 remainder 
##  820.9013 4093.5715  549.8578 1742.7732 6727.5536

Exercise 3

Task a)

# Parse a JSON file and return its contents as a data frame
import_json <- function(file) {
  as.data.frame(fromJSON(file))
}

# Load each JSON file
world_df <- import_json("surface_temp/world.json")
tropics_df <- import_json("surface_temp/tropics.json")
sh_df <- import_json("surface_temp/sh.json")
nh_df <- import_json("surface_temp/nh.json")
arctic_df <- import_json("surface_temp/arctic.json")
antarctic_df <- import_json("surface_temp/antarctic.json")

# Explore the head and tail of the data frame
head(world_df, 3)
tail(world_df, 5)

Investigate the head and tail of the data. Is there anything that can be removed?

We can see that the tail of the JSON files can be removed — the last three rows. These last three rows are not necessary for the data frame.

# Clean one raw region data frame:
#   * unnest the nested "data" column into a numeric `temperature` column
#   * attach a daily `date` index starting 1979-01-01
#   * drop the `name` column, clip to 1979-2022 and remove leap days
preprocess_one <- function(df) {
  
  # Unnest the "data" column to get the temperature values
  df <- df %>%
    unnest(cols = c(data)) %>%
    rename(temperature = data)
  
  # One row per day, counted from the first observation on 1979-01-01
  df$date <- as.Date("1979-01-01") + (seq_along(df$temperature) - 1)
  
  # Drop the "name" column
  df <- df[, names(df) != "name"]
  
  # Keep only the 1979-2022 window
  df <- df[df$date >= as.Date("1979-01-01") & df$date <= as.Date("2022-12-31"), ]
  
  # Put "date" first, then strip the leap days (February 29th)
  df <- df[, c("date", setdiff(names(df), "date"))]
  df[format(df$date, "%m-%d") != "02-29", ]
}

# Preprocess every region with the same routine
raw_regions <- list(world = world_df, tropics = tropics_df, sh = sh_df,
                    nh = nh_df, arctic = arctic_df, antarctic = antarctic_df)
prepped <- lapply(raw_regions, preprocess_one)

preprocessed_world_df <- prepped$world
preprocessed_tropics_df <- prepped$tropics
preprocessed_sh_df <- prepped$sh
preprocessed_nh_df <- prepped$nh
preprocessed_arctic_df <- prepped$arctic
preprocessed_antarctic_df <- prepped$antarctic

# Inspect the head and tail of one of the preprocessed data frames
print(head(preprocessed_world_df, 5))
## # A tibble: 5 × 2
##   date       temperature
##   <date>           <dbl>
## 1 1979-01-01        12.3
## 2 1979-01-02        12.3
## 3 1979-01-03        12.3
## 4 1979-01-04        12.2
## 5 1979-01-05        12.3
print(tail(preprocessed_world_df, 5))
## # A tibble: 5 × 2
##   date       temperature
##   <date>           <dbl>
## 1 2022-12-27        13.5
## 2 2022-12-28        13.5
## 3 2022-12-29        13.4
## 4 2022-12-30        13.4
## 5 2022-12-31        13.3
# Merge the six preprocessed region data frames on "date" into one wide
# data frame with columns date, world, tropics, sh, nh, arctic, antarctic.
#
# Each input is expected to carry a "temperature" value column.  The column
# is renamed to its region name BEFORE merging, so merge() never sees
# duplicate "temperature" names (the previous sequential merges emitted
# "column names 'temperature.x', 'temperature.y' are duplicated" warnings).
merge_dataset <- function(world, tropics, sh, nh, arctic, antarctic) {
  regions <- list(world = world, tropics = tropics, sh = sh, nh = nh,
                  arctic = arctic, antarctic = antarctic)

  # Rename each region's value column to the region name
  for (nm in names(regions)) {
    names(regions[[nm]])[names(regions[[nm]]) == "temperature"] <- nm
  }

  # Full outer join of all regions on the date column
  df_merged <- Reduce(function(a, b) merge(a, b, by = "date", all = TRUE),
                      regions)

  # Enforce the expected column names (also guards against surprises)
  colnames(df_merged) <- c("date", "world", "tropics", "sh", "nh", "arctic", "antarctic")

  return(df_merged)
}

# Merge the six preprocessed regions into one wide data frame
df_1 <- merge_dataset(preprocessed_world_df, preprocessed_tropics_df,
                      preprocessed_sh_df, preprocessed_nh_df,
                      preprocessed_arctic_df, preprocessed_antarctic_df)
## Warning in merge.data.frame(df_merged, nh, by = "date", all = TRUE): column
## names 'temperature.x', 'temperature.y' are duplicated in the result
## Warning in merge.data.frame(df_merged, arctic, by = "date", all = TRUE): column
## names 'temperature.x', 'temperature.y' are duplicated in the result
## Warning in merge.data.frame(df_merged, antarctic, by = "date", all = TRUE):
## column names 'temperature.x', 'temperature.y', 'temperature.x', 'temperature.y'
## are duplicated in the result
# Linearly interpolate any remaining gaps in the temperature columns
df_imputed <- df_1
df_imputed[, -1] <- na.approx(df_imputed[, -1])


# Inspect the merged data frame
head(df_1)

# Work on a copy and add a year column for coloring the lines
dfplot_1 = df_imputed # Made a copy

dfplot_1$year <- format(dfplot_1$date, "%Y")

# One seasonal plot per region: day-of-year on the x-axis, one colored line
# per calendar year
region_titles <- c(world = 'World',
                   tropics = 'Tropics',
                   sh = 'Southern Hemisphere',
                   nh = 'Northern Hemisphere',
                   arctic = 'Arctic',
                   antarctic = 'Antarctic')

for (region in names(region_titles)) {
  p <- ggplot(dfplot_1,
              aes(x = yday(date), y = .data[[region]], color = as.factor(year))) +
    geom_line(alpha = 0.5, linewidth = 0.5) +
    labs(title = region_titles[[region]], x = 'Day of the Year',
         y = 'Temperature', color = 'Year') +
    theme_minimal()
  print(p)
}

# Correlation matrix across the six regions (date column excluded)
cor_matrix <- cor(df_imputed[, -1],
                  use = "complete.obs")

# Generate correlation plot
corrplot(cor_matrix, method = "number", type = "upper", 
         order = "hclust", tl.col = "black", tl.srt = 45)

# Perform PCA on the scaled region series.  prcomp() has no `na.action`
# argument -- passing one only triggered an "extra argument will be
# disregarded" warning -- and df_imputed has already been gap-filled above.
pca <- prcomp(df_imputed[, -1], scale. = TRUE)

# Plot explained variances
screeplot(pca, type = "l", col = "blue",
          main = "Explained Variances by Principal Components")

# PCA Scores Plot using Base R
biplot(pca, col = c("red", "blue"), cex = 0.75, main = "PCA Biplot")

# Extracting scores
scores <- as.data.frame(pca$x)

# Plotting scores
ggplot(scores, aes(x = PC1, y = PC2)) +
  geom_point(aes(color = PC1), size = 3) +
  labs(title = "PCA Scores Plot", x = "PC1", y = "PC2") +
  theme_minimal()

# Print the summary of the PCA
summary(pca)
## Importance of components:
##                           PC1    PC2     PC3     PC4     PC5       PC6
## Standard deviation     2.1756 1.0300 0.38231 0.20983 0.12410 0.0004078
## Proportion of Variance 0.7889 0.1768 0.02436 0.00734 0.00257 0.0000000
## Cumulative Proportion  0.7889 0.9657 0.99010 0.99743 1.00000 1.0000000

Task d)

How do temperature patterns in the regions differ from each other?

The temperature patterns contrast between two groups of regions: antarctic & sh on one side, and arctic, nh & world on the other. These two groups are inverse to one another. This makes sense, because the seasons differ between the hemispheres. The tropics do not have a strong correlation with the other regions. The temperature pattern for world is strongly influenced by other regions such as arctic and nh, which is why it lies on the right side of PC1.

How can the relationships be identified in the given plots?

From the PCA biplot, we can see that regions as sh & antarctic and arctic & nh are alike and also contrast each other. While the tropics are not correlated to either of the other regions. The PCA scores plot shows us the variance of the PC1 with consideration to the PC2.

Exercise 4

Task a)

Parameter range considerations:

\(s.window\): This parameter should be chosen by considering the nature of the seasonal pattern in the data, ensuring it’s large enough to smooth out noise but not blur distinctions between different periods.

\(t.window\): It needs to be large enough to filter out cyclical and irregular fluctuations, but not so large that it lacks responsiveness to genuine changes in the trend. Typically, a minimum of 3 is used.

The requirement for \(s.window\) and \(t.window\) to be odd and greater than 0 is primarily due to the way moving averages and local smoothing are applied to estimate the seasonal and trend components. Using an odd window size ensures that the local average is symmetrically calculated around the point of interest. For example, if \(s.window = 3\), one observation before and one after the point of interest are used for averaging, maintaining symmetry.

\(s.window\) and \(t.window\) represent the number of periods used for local averaging, and they must be positive integers to make logical sense. A window size of 0 or negative would not allow for any meaningful computation of averages or smoothing.

Computing the different components of the task:

# NOTE(review): this version of compute_components() is immediately shadowed
# by the redefinition that follows it, so it never runs; consider deleting it.
# Its window indexing wraps indices around the series with the modulo trick
# below, which mixes observations from the far end of the series into each
# local average -- presumably why it was replaced by the clamped-window
# version defined next.
compute_components <- function(xt, p, s_window, t_window) {
  n <- length(xt)
  st <- numeric(n)  # Seasonal component
  tau_t <- numeric(n)  # Trend component
  rt <- numeric(n)  # Remainder component
  w <- (s_window - 1) / 2  # seasonal half-window, in number of periods
  v <- (t_window - 1) / 2  # trend half-window, in number of observations

  # Seasonal component: average over +/- w*p observations around t,
  # with indices wrapped into 1..n by the modulo
  for (t in 1:n) {
    Wt <- ((t - w * p):(t + w * p)) %% n + 1  
    st[t] <- mean(xt[Wt], na.rm = TRUE)
  }

  # Deseasonalized series
  yt <- xt - st

  # Trend component: moving average of the deseasonalized series
  for (t in 1:n) {
    Vt <- ((t - v):(t + v)) %% n + 1 
    tau_t[t] <- mean(yt[Vt], na.rm = TRUE)
  }

  # Remainder component: what is left after seasonal and trend removal
  rt <- yt - tau_t

  return(list(seasonal = st, trend = tau_t, remainder = rt))
}
# Additive decomposition of a series into seasonal, trend and remainder.
#
# xt:       numeric vector or ts to decompose.
# p:        seasonal period (observations per season).
# s_window: seasonal smoothing window (odd, in periods).
# t_window: trend smoothing window (odd, in observations).
#
# Returns list(seasonal, trend, remainder).  Windows are clamped to the
# series bounds, so edges average over fewer observations.
compute_components <- function(xt, p, s_window, t_window) {
  n <- length(xt)
  w <- (s_window - 1) / 2  # seasonal half-window, in periods
  v <- (t_window - 1) / 2  # trend half-window, in observations

  # Mean of `values` over a window of +/- half_width around `center`,
  # clamped to the valid index range 1..n
  window_mean <- function(values, center, half_width) {
    idx <- seq(max(1, center - half_width), min(n, center + half_width))
    mean(values[idx], na.rm = TRUE)
  }

  # Seasonal component: local average over +/- w*p observations
  st <- vapply(seq_len(n), function(t) window_mean(xt, t, w * p), numeric(1))

  # Deseasonalized series
  yt <- xt - st

  # Trend component: moving average of the deseasonalized series
  tau_t <- vapply(seq_len(n), function(t) window_mean(yt, t, v), numeric(1))

  # Remainder: what seasonal and trend leave unexplained
  rt <- yt - tau_t

  list(seasonal = st, trend = tau_t, remainder = rt)
}

Task b)

# Grid search over s.window / t.window, keeping the pair that minimizes the
# variance of the remainder returned by compute_components().
optimize_parameters <- function(xt, p, s_window_range, t_window_range) {
  best_params <- list(s_window = NA, t_window = NA)
  min_var <- Inf  # smallest remainder variance seen so far

  # t_window varies fastest: for each s.window value all t.window values are
  # tried before moving on.  With the strict < below, ties keep the earliest
  # candidate in that order.
  grid <- expand.grid(t_window = t_window_range, s_window = s_window_range)

  for (k in seq_len(nrow(grid))) {
    components <- compute_components(xt, p, grid$s_window[k], grid$t_window[k])
    var_remainder <- var(components$remainder, na.rm = TRUE)

    if (var_remainder < min_var) {
      min_var <- var_remainder
      best_params$s_window <- grid$s_window[k]
      best_params$t_window <- grid$t_window[k]
    }
  }

  best_params
}
# Plot the original series and its three decomposition components as four
# stacked ggplot panels.
#
# xt:         the original series (ts or numeric vector).
# components: list with $seasonal, $trend and $remainder, as returned by
#             compute_components().
# main_title: title prefix for the top panel.
plot_decomposition <- function(xt, components, main_title) {
  # as.numeric() strips any ts class from the columns; previously the ts
  # columns made ggplot warn "Don't know how to automatically pick scale
  # for object of type <ts>".  (An unused time(xt) local was also removed.)
  data <- data.frame(Time = seq_along(xt),
                     Original = as.numeric(xt),
                     Seasonal = as.numeric(components$seasonal),
                     Trend = as.numeric(components$trend),
                     Remainder = as.numeric(components$remainder))
  
  p1 <- ggplot(data, aes(x = Time, y = Original)) +
    geom_line() +
    ggtitle(paste(main_title, '- Original Time Series')) +
    theme_minimal()
  
  p2 <- ggplot(data, aes(x = Time, y = Seasonal)) +
    geom_line() +
    ggtitle('Seasonal Component') +
    theme_minimal()
  
  p3 <- ggplot(data, aes(x = Time, y = Trend)) +
    geom_line() +
    ggtitle('Trend Component') +
    theme_minimal()
  
  p4 <- ggplot(data, aes(x = Time, y = Remainder)) +
    geom_line() +
    ggtitle('Remainder Component') +
    theme_minimal()
  
  # Display the plots
  gridExtra::grid.arrange(p1, p2, p3, p4, ncol = 1)
}
# Odd window sizes from 3 to 35 for both smoothing parameters
s_window_range <- seq(3, 35, 2)
t_window_range <- seq(3, 35, 2)

# Built-in datasets and their seasonal periods (quarterly / monthly)
datasets <- list(austres = austres, co2 = co2, nottem = nottem)
p_values <- list(austres = 4, co2 = 12, nottem = 12)

# Tune the windows per dataset, decompose with the winners, and plot
for (data_name in names(datasets)) {
  series <- datasets[[data_name]]
  period <- p_values[[data_name]]

  best_params <- optimize_parameters(series, period, s_window_range, t_window_range)
  components <- compute_components(series, period,
                                   best_params$s_window, best_params$t_window)
  plot_decomposition(series, components, paste(data_name, '- Optimal Decomposition'))
}
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.

## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.

## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.

Task c)

# Plot the ACF of the original series followed by the ACF of each
# decomposition component.  Returns the last acf object invisibly.
plot_autocorrelation <- function(xt, components, title) {
  specs <- list(
    list(x = xt,                    main = paste(title, '- ACF of Original Series')),
    list(x = components$seasonal,   main = paste(title, ' - ACF of Seasonal Component')),
    list(x = components$trend,      main = paste(title, ' - ACF of Trend Component')),
    list(x = components$remainder,  main = paste(title, ' - ACF of Remainder Component'))
  )

  out <- NULL
  for (s in specs) {
    out <- acf(s$x, main = s$main)
  }
  invisible(out)
}

# ACF diagnostics for each dataset, re-tuning the windows so this chunk is
# self-contained
for (data_name in names(datasets)) {
  series <- datasets[[data_name]]
  period <- p_values[[data_name]]

  # Tune the smoothing windows for this dataset
  tuned <- optimize_parameters(series, period, s_window_range, t_window_range)

  # Decompose with the tuned windows
  components <- compute_components(series, period, tuned$s_window, tuned$t_window)

  # Plot the autocorrelation of the series and of every component
  plot_autocorrelation(series, components, data_name)
}

Regarding which component is the most dominant for every dataset:

Austres: The seasonal component is most dominant in the dataset, and appears to have a high correlation to the acf of the original data. The trend component has strongest correlations to the original time series at low values of lag, while the remainder component doesn’t appear to explain much of the variance in the dataset.

Co2: The seasonal component has a high correlation with the original acf for most levels of lag. The trend component and the remainder component is less dominant or this dataset. Correlation levels for these two components have a cyclical distribution for higher levels of lag.

Nottem: The distribution for the trend component and the remainder component appears to be similar to the original ACF for this dataset. The seasonal component appears to be less dominant for this dataset.

Task d)

1. Smoothing

  • Reduces noise and highlights longer-term patterns or trends in the data. Smoothing helps to reveal underlying patterns (like trends or seasonality) by reducing the impact of short-term fluctuations or noise. It can be useful when the data is noisy, as it helps to reveal the underlying patterns in the data. However, excessive smoothing might remove genuine patterns, so it should be applied judiciously. In the context of the overall procedure in the exercise, it might be useful to reduce the noise in the remainder component, making the seasonal and trend components clearer.

2. Differencing

  • Removes trends and seasonality, making the series more stationary. Differencing is used to stabilize the mean of a time series by removing changes in the level of a time series, and therefore, eliminating (or reducing) trend and seasonality. It’s crucial when dealing with non-stationary data. Since many statistical methods assume stationarity, differencing can be a useful preprocessing step. However, it might not always be necessary and should be applied based on the characteristics of the data. Regarding this exercise, differencing might be useful if the original series is non-stationary and we want to stabilize the mean to make further analysis valid.

3. Standardization

  • Transforms the data to have zero mean and unit variance. Standardization makes the series scale-independent and is particularly useful when comparing measurements that have different units or when comparing the variability of different series. It may be useful when comparing different time series or when the algorithm is sensitive to the scale of the data. For decomposition, where we are interested in the original scale for interpreting the trend, standardization might not be desirable.